[1.1] Import the required packages
# required python libraries
import pandas as pd
import numpy as np
from joblib import dump
import seaborn as sns
import matplotlib.pyplot as plt
import time
# scikit-learn models and functions
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import mutual_info_regression
# Logistic Regression Models
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import plot_confusion_matrix
import warnings
# Silence deprecation/future warnings from the libraries above so the
# notebook output stays readable. NOTE(review): this also hides warnings
# that may flag real problems — consider narrowing the filter.
warnings.filterwarnings('ignore')
[1.2] Load the training and test datasets
[2.2] Perform investigations to understand the training data
# [1.2]/[2.2] Load the raw training and final-test CSVs, then take a first look.
df_train = pd.read_csv('../data/raw/train.csv')
df_test = pd.read_csv('../data/raw/test.csv')

# Show every column when displaying frames (no column truncation).
pd.set_option("display.max_columns", None)

# First rows, dimensions, dtypes and summary statistics.
df_train.head()
df_train.shape
df_train.info()
df_train.describe()

# Missing-value check: one overall flag plus the per-cell mask.
print('Any NULL/NaN values?', df_train.isna().values.any())
df_train.isna()
Check which features are normally distributed versus skewed. Features with skewed distributions may need transforming (in future project stages).
import matplotlib.pyplot as plt

plt.rcParams["figure.figsize"] = (20, 10)


def draw_histograms(df, variables, n_rows, n_cols):
    """Plot a grid of n_rows x n_cols histograms (10 bins each), one
    subplot per column name in `variables`, titled with the column name."""
    fig = plt.figure()
    for idx, col_name in enumerate(variables):
        axis = fig.add_subplot(n_rows, n_cols, idx + 1)
        df[col_name].hist(bins=10, ax=axis)
        axis.set_title(col_name)
    plt.show()


# One histogram per training column — a quick scan for skewed distributions.
hist_df = df_train
draw_histograms(hist_df, hist_df.columns, 3, 7)
This might be overkill - 21x21 columns - but I was curious how it would turn out, and if it would show anything interesting visually.
# Pairwise scatter plots of every feature pair — deliberately disabled
# (a ~20x20 pairplot is slow); flip the guard to run it.
if False:
    df_plot = df_train.copy()
    df_plot.drop(['Id'], axis=1, inplace=True)
    df_plot.columns = df_plot.columns.str.strip()
    target = df_plot.pop('TARGET_5Yrs')
    ax = sns.pairplot(df_plot)  # hue='Type' could colour by a category
    plt.title('Pairwise relationships between the features')
    plt.show()
Imbalanced data may be addressed in the next stage of this project due to time constraints for this deadline.
# Class balance of the target: how many players lasted 5+ years (1) vs not (0).
class_counts = df_train.groupby('TARGET_5Yrs').size()
plt.rcParams["figure.figsize"] = (5, 4)
plt.bar(['0 = No', '1 = Yes'], class_counts)
plt.xlabel("TARGET_5Yrs")
plt.ylabel("# count")
plt.show()
A correlation matrix can help us quickly understand the correlations between each pair of variables. When two independent variables are highly correlated, this results in a problem known as multicollinearity and it can make it hard to interpret the results of the regression. One of the easiest ways to detect a potential multicollinearity problem is to look at a correlation matrix and visually check whether any of the variables are highly correlated with each other.
import seaborn as sns
import matplotlib.pyplot as plt

# Pearson correlations between all features (Id and target excluded via iloc).
correlation_matrix = pd.DataFrame(df_train.iloc[:, 1:-1]).corr()

plt.figure(figsize=(12, 10))
ax = sns.heatmap(
    correlation_matrix,
    vmax=1,
    square=True,
    annot=True,
    fmt='.2f',
    cmap='GnBu',
    cbar_kws={"shrink": .7},
    robust=True,
)
plt.title('Correlation matrix between the features', fontsize=20)
plt.show()
The values in the covariance matrix show the magnitude and direction of the joint variation between each pair of variables.
Covariance indicates how two variables change together. If an increase in one variable tends to accompany an increase in the other (and decreases likewise occur together), the two variables have a positive covariance.
You can use the covariance to determine the direction of a linear relationship between two variables as follows:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Feature names (Id and target excluded) — reused as the heatmap axis labels.
labels = df_train.iloc[:, 1:-1].columns

# Standardise first: covariance magnitudes are not comparable across
# features measured in different units.
scaler = StandardScaler()
df_scale = scaler.fit_transform(df_train.iloc[:, 1:-1])

# np.cov expects variables as rows and observations as columns,
# so transpose the (rows x features) matrix.
data = df_scale.transpose()
covMatrix = np.cov(data, bias=True)

plt.figure(figsize=(12, 10))
sns.heatmap(covMatrix, annot=True, fmt='.2g', cbar_kws={"shrink": .7})
plt.xticks(np.arange(len(labels)), labels=labels, rotation=90)
plt.yticks(np.arange(len(labels)), labels=labels, rotation=0)
plt.title('Covariance matrix between the features', fontsize=20)
plt.show()
[3.1] Copy data for transformation for modelling steps
# [3.1] Work on copies so the raw frames stay untouched for later inspection.
df_cleaned = df_train.copy()
df_clean_test = df_test.copy()
[3.2] We need to drop the Id column as this is irrelevant for modelling
# [3.2] Id is a row identifier with no predictive value — drop it from both sets.
for frame in (df_cleaned, df_clean_test):
    frame.drop(['Id'], axis=1, inplace=True)
[3.3] Remove leading and trailing space from the column names
# [3.3] Normalise column names: strip leading/trailing whitespace on both sets.
for frame in (df_cleaned, df_clean_test):
    frame.columns = frame.columns.str.strip()
[3.4] Extract the column TARGET_5Yrs and save it into variable called target
# [3.4] Separate the target from the features.
target = df_cleaned.pop('TARGET_5Yrs')
print('df_cleaned.shape', df_cleaned.shape, '\n')
# Keep the feature names — needed later when plotting feature-selection scores.
labels = df_cleaned.columns
[3.5] Import StandardScaler from sklearn.preprocessing
[3.6] Instantiate the StandardScaler
[3.7] Fit and apply the scaling on df_cleaned
# [3.5-3.7] Standardise features to zero mean / unit variance.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Fit the scaler on the training features only, then apply the SAME
# fitted scaling to the hold-out test set.
# BUG FIX: the original called fit_transform on df_clean_test too, which
# re-fitted the scaler on the test data (data leakage) and left the
# scaler saved in [3.9] fitted to the wrong frame.
df_cleaned = scaler.fit_transform(df_cleaned)
df_clean_test = scaler.transform(df_clean_test)
[3.8] Import dump from joblib
[3.9] Save the scaler into the folder models and call the file scaler.joblib
# [3.8-3.9] Persist the fitted scaler so the identical transform can be
# re-applied at inference time.
from joblib import dump

dump(scaler, '../models/scaler.joblib')
[3.10] Import train_test_split from sklearn.model_selection
[3.11] Split randomly the dataset with random_state=8 into 2 different sets: data (80%) and test (20%)
[3.12] Split the remaining data (80%) randomly with random_state=8 into 2 different sets: training (80%) and validation (20%)
from sklearn.model_selection import train_test_split

# [3.11] 80/20 split: modelling data vs a held-back test set.
X_data, X_test, y_data, y_test = train_test_split(
    df_cleaned, target, test_size=0.2, random_state=8
)
# [3.12] Split the 80% again: train vs validation (64% / 16% of the whole).
X_train, X_val, y_train, y_val = train_test_split(
    X_data, y_data, test_size=0.2, random_state=8
)
[3.13] Save the different sets in the folder data/processed
# [3.13] Persist every split as .npy under data/processed, plus the
# scaled Kaggle test set for the final submission.
splits = {
    'X_train': X_train,
    'X_val': X_val,
    'X_test': X_test,
    'y_train': y_train,
    'y_val': y_val,
    'y_test': y_test,
    'final_test': df_clean_test,
}
for split_name, split_data in splits.items():
    np.save('../data/processed/' + split_name, split_data)
[4.1] Calculate the average of the target variable for the training set and save it into a variable called y_mean
[4.2] Create a numpy array called y_base of dimensions (len(y_train), 1) filled with this value
# [4.1] Baseline model: always predict the training-set mean of the target.
y_mean = y_train.mean()
print('y_mean', y_mean)
# [4.2] Column vector of shape (len(y_train), 1) filled with that mean.
y_base = np.full((len(y_train), 1), y_mean)
[4.3] Import the MSE and MAE metrics from sklearn
[4.4] Display the RMSE and MAE scores of this baseline model
# [4.3-4.4] Score the mean baseline against the training targets.
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae

print('1. Baseline model scores - training data')
# squared=False turns MSE into RMSE.
print('RMSE:', mse(y_train, y_base, squared=False))
print('MAE: ', mae(y_train, y_base))
Note: I have not used 5.1 or 5.2 in this experiment. However, they are here for experiment 2 future work.
I have incorporated 5.3 and 5.4 into the feature selection for Tests 2 and 3 in this experiment.
Linear models penalized with the L1 norm have sparse solutions: many of their estimated coefficients are zero. When the goal is to reduce the dimensionality of the data to use with another classifier, they can be used along with SelectFromModel to select the non-zero coefficients. In particular, sparse estimators useful for this purpose are the Lasso for regression, and of LogisticRegression and LinearSVC for classification. https://scikit-learn.org/stable/modules/feature_selection.html#l1-based-feature-selection
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

# L1-penalised LinearSVC drives many coefficients to exactly zero;
# SelectFromModel then keeps only the features with non-zero weights.
X, y = X_data, y_data
print('LinearSVC')
print('X ', X.shape)
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
selector = SelectFromModel(lsvc, prefit=True)
X_new = selector.transform(X)
print('X_new', X_new.shape)
Tree-based estimators can be used to compute impurity-based feature importances, which in turn can be used to discard irrelevant features (when coupled with the SelectFromModel meta-transformer). https://scikit-learn.org/stable/modules/feature_selection.html#tree-based-feature-selection
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

# Impurity-based feature importances from a forest of randomised trees,
# again coupled with SelectFromModel to discard low-importance features.
X, y = X_data, y_data
print('ExtraTreesClassifier')
print(X.shape)
clf = ExtraTreesClassifier(n_estimators=50).fit(X, y)
print(clf.feature_importances_)
model = SelectFromModel(clf, prefit=True)
X_new = model.transform(X)
print(X_new.shape)
Select features according to the k highest scores.
https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import mutual_info_regression
import matplotlib.pyplot as plt

# Univariate F-test score per feature. k='all' keeps every feature —
# here we only want the scores themselves, for plotting.
f_selector = SelectKBest(score_func=f_regression, k='all')
f_selector.fit(X_train, y_train)
X_train_fs = f_selector.transform(X_train)
X_test_fs = f_selector.transform(X_test)

# Bar chart of the per-feature F-values.
plt.rcParams["figure.figsize"] = (7, 5)
plt.bar(range(len(f_selector.scores_)), f_selector.scores_)
plt.xticks(np.arange(len(labels)), labels=labels, rotation=45)
plt.xlabel("feature index")
plt.ylabel("F-value (transformed from the correlation values)")
plt.show()
Estimate mutual information for a continuous target variable.
Mutual information (MI) [1] between two random variables is a non-negative value, which measures the dependency between the variables. It is equal to zero if and only if two random variables are independent, and higher values mean higher dependency.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import mutual_info_regression
import matplotlib.pyplot as plt

# Estimated mutual information between each feature and the target;
# zero means independent, larger means stronger dependency.
f_selector = SelectKBest(score_func=mutual_info_regression, k='all')
f_selector.fit(X_train, y_train)
X_train_fs = f_selector.transform(X_train)
X_test_fs = f_selector.transform(X_test)

# Bar chart of the per-feature MI estimates.
plt.rcParams["figure.figsize"] = (7, 5)
plt.bar(range(len(f_selector.scores_)), f_selector.scores_)
plt.xlabel("feature index")
plt.xticks(np.arange(len(labels)), labels=labels, rotation=45)
plt.ylabel("Estimated MI value")
plt.show()
Training using Neural Network
https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
[6.1] Train the model
# [6.1] Train an MLP classifier with the configuration below.
from sklearn.neural_network import MLPClassifier

rerun_no = 1
classifier_name = "NeuralNetworkMLP"
random_state = 2
max_iter = 300
activation = 'logistic'
solver = 'adam'    # sgd, adam (default)
alpha = 0.01       # 0.0001 default
batch_size = 100
#--------------------------------------------------------------------------
t_start = time.process_time()
# BUG FIX: max_iter was defined in the config block above but never passed
# to MLPClassifier, so the sklearn default (200) was silently used.
classifier = MLPClassifier(
    activation=activation,
    solver=solver,
    alpha=alpha,
    batch_size=batch_size,
    max_iter=max_iter,
    random_state=random_state,
).fit(X_train, y_train)
t_end = time.process_time()
t_diff = t_end - t_start  # CPU seconds spent fitting
#--------------------------------------------------------------------------
# Majority-class baseline for accuracy comparison. Series.mode() can return
# several values when there is a tie — take the first so np.full gets a scalar.
y_base = np.full((len(y_train), 1), y_train.mode()[0])
print("Compare accuracy between data sets")
print("Baseline: ", accuracy_score(y_train, y_base))
print("Train data: ", classifier.score(X_train, y_train))
print("Validation: ", classifier.score(X_val, y_val))
[6.2] Save the fitted model into the folder 'models'
#--------------------------------------------------------------------------
# [6.2] Persist the fitted model to ../models, named per rerun and classifier.
model_path = '../models/r{d}_{c}.joblib'.format(d=rerun_no, c=classifier_name)
dump(classifier, model_path)
[6.3] Model evaluation and performance
# [6.3] Model evaluation: ROC/AUC on the held-back test set, plus
# normalised confusion matrices on the train and validation sets.
from sklearn.metrics import roc_curve
# Predicted probability of the positive class for each test row.
pred_prob = classifier.predict_proba(X_test)
fpr, tpr, thresh = roc_curve(y_test, pred_prob[:,1], pos_label=1)
# Reference diagonal (tpr == fpr): a no-skill model that always predicts 0.
random_probs = [0 for i in range(len(y_test))]
p_fpr, p_tpr, _ = roc_curve(y_test, random_probs, pos_label=1)
from sklearn.metrics import roc_auc_score
auc_score = roc_auc_score(y_test, pred_prob[:,1])
print('auc_score', auc_score)

import matplotlib.pyplot as plt
# COMPAT FIX: plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2. Use it when available, otherwise fall back to the modern
# ConfusionMatrixDisplay.from_estimator API.
try:
    from sklearn.metrics import plot_confusion_matrix as _plot_cm_fn

    def _plot_cm(est, X, y):
        # Row-normalised confusion matrix for the given estimator/data.
        _plot_cm_fn(est, X, y, cmap=plt.cm.Blues, normalize='true')
except ImportError:  # scikit-learn >= 1.2
    from sklearn.metrics import ConfusionMatrixDisplay

    def _plot_cm(est, X, y):
        # Row-normalised confusion matrix for the given estimator/data.
        ConfusionMatrixDisplay.from_estimator(
            est, X, y, cmap=plt.cm.Blues, normalize='true')

print('Confusion matrix - training data')
_plot_cm(classifier, X_train, y_train)
print('Confusion matrix - validation data')
_plot_cm(classifier, X_val, y_val)
https://www.kaggle.com/c/uts-advdsi-22-02-nba-career-prediction/overview
# Build the Kaggle submission: Id column + predicted probability that
# TARGET_5Yrs == 1, rounded to 2 decimal places.
y_final_preds = classifier.predict_proba(df_clean_test)
# Pair the original Id column with the positive-class probabilities
# (converted to a DataFrame first so concat aligns them side by side).
submission = pd.concat(
    [df_test.iloc[:, 0], pd.DataFrame(y_final_preds[:, 1])], axis=1)
submission.columns = ['Id', 'tmp']
submission['TARGET_5Yrs'] = [round(num, 2) for num in submission['tmp']]
submission.drop(['tmp'], axis=1, inplace=True)
#--------------------------------------------------------------------------
# Save the final predictions for submission to Kaggle.
out_path = '../data/processed/group1_r{d}_{c}.csv'.format(d=rerun_no, c=classifier_name)
submission.to_csv(out_path, index=False)
print('kaggle results saved ' + out_path)
[7.1] Add changes to git staging area
[7.2] Create the snapshot of your repository and add a description
[7.3] Push your snapshot to Github
# Code saved here for easy reference, but do not run as code
# (shell commands used to stage, commit and push this work to GitHub).
# https://github.com/CazMayhem/adv_dsi_AT1
"""
# Add changes to git staging area
git add .
# Create the snapshot of your repository and add a description
git commit -m "assignement & kaggle submission"
# Push your snapshot to Github
git push https://******@github.com/CazMayhem/adv_dsi_AT1.git
"""
[7.4] Close Jupyter Lab with control (command) + c